{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "import gym\n",
    "import itertools\n",
    "import matplotlib\n",
    "import numpy as np\n",
    "import sys\n",
    "import tensorflow as tf\n",
    "import collections\n",
    "\n",
    "import sklearn.pipeline\n",
    "import sklearn.preprocessing\n",
    "\n",
    "if \"../\" not in sys.path:\n",
    "  sys.path.append(\"../\") \n",
    "from lib.envs.cliff_walking import CliffWalkingEnv\n",
    "from lib import plotting\n",
    "\n",
    "from sklearn.kernel_approximation import RBFSampler\n",
    "\n",
    "matplotlib.style.use('ggplot')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2017-06-16 13:11:05,265] Making new env: MountainCarContinuous-v0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "array([-0.21213569,  0.03012651])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "env = gym.envs.make(\"MountainCarContinuous-v0\")\n",
    "env.observation_space.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "FeatureUnion(n_jobs=1,\n",
       "       transformer_list=[('rbf1', RBFSampler(gamma=5.0, n_components=100, random_state=None)), ('rbf2', RBFSampler(gamma=2.0, n_components=100, random_state=None)), ('rbf3', RBFSampler(gamma=1.0, n_components=100, random_state=None)), ('rbf4', RBFSampler(gamma=0.5, n_components=100, random_state=None))],\n",
       "       transformer_weights=None)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Feature Preprocessing: Normalize to zero mean and unit variance\n",
    "# We use a few samples from the observation space to do this\n",
    "observation_examples = np.array([env.observation_space.sample() for x in range(10000)])\n",
    "scaler = sklearn.preprocessing.StandardScaler()\n",
    "scaler.fit(observation_examples)\n",
    "\n",
    "# Used to converte a state to a featurizes represenation.\n",
    "# We use RBF kernels with different variances to cover different parts of the space\n",
    "featurizer = sklearn.pipeline.FeatureUnion([\n",
    "        (\"rbf1\", RBFSampler(gamma=5.0, n_components=100)),\n",
    "        (\"rbf2\", RBFSampler(gamma=2.0, n_components=100)),\n",
    "        (\"rbf3\", RBFSampler(gamma=1.0, n_components=100)),\n",
    "        (\"rbf4\", RBFSampler(gamma=0.5, n_components=100))\n",
    "        ])\n",
    "featurizer.fit(scaler.transform(observation_examples))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def featurize_state(state):\n",
    "    \"\"\"\n",
    "    Returns the featurized representation for a state.\n",
    "    \"\"\"\n",
    "    scaled = scaler.transform([state])\n",
    "    featurized = featurizer.transform(scaled)\n",
    "    return featurized[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class PolicyEstimator():\n",
    "    \"\"\"\n",
    "    Policy Function approximator. \n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self, learning_rate=0.01, scope=\"policy_estimator\"):\n",
    "        with tf.variable_scope(scope):\n",
    "            self.state = tf.placeholder(tf.float32, [400], \"state\")\n",
    "            self.target = tf.placeholder(dtype=tf.float32, name=\"target\")\n",
    "\n",
    "            # This is just linear classifier\n",
    "            self.mu = tf.contrib.layers.fully_connected(\n",
    "                inputs=tf.expand_dims(self.state, 0),\n",
    "                num_outputs=1,\n",
    "                activation_fn=None,\n",
    "                weights_initializer=tf.zeros_initializer)\n",
    "            self.mu = tf.squeeze(self.mu)\n",
    "            \n",
    "            self.sigma = tf.contrib.layers.fully_connected(\n",
    "                inputs=tf.expand_dims(self.state, 0),\n",
    "                num_outputs=1,\n",
    "                activation_fn=None,\n",
    "                weights_initializer=tf.zeros_initializer)\n",
    "            \n",
    "            self.sigma = tf.squeeze(self.sigma)\n",
    "            self.sigma = tf.nn.softplus(self.sigma) + 1e-5\n",
    "            self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)\n",
    "            self.action = self.normal_dist._sample_n(1)\n",
    "            self.action = tf.clip_by_value(self.action, env.action_space.low[0], env.action_space.high[0])\n",
    "\n",
    "            # Loss and train op\n",
    "            self.loss = -self.normal_dist.log_prob(self.action) * self.target\n",
    "            # Add cross entropy cost to encourage exploration\n",
    "            self.loss -= 1e-1 * self.normal_dist.entropy()\n",
    "            \n",
    "            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
    "            self.train_op = self.optimizer.minimize(\n",
    "                self.loss, global_step=tf.contrib.framework.get_global_step())\n",
    "    \n",
    "    def predict(self, state, sess=None):\n",
    "        sess = sess or tf.get_default_session()\n",
    "        state = featurize_state(state)\n",
    "        return sess.run(self.action, { self.state: state })\n",
    "\n",
    "    def update(self, state, target, action, sess=None):\n",
    "        sess = sess or tf.get_default_session()\n",
    "        state = featurize_state(state)\n",
    "        feed_dict = { self.state: state, self.target: target, self.action: action  }\n",
    "        _, loss = sess.run([self.train_op, self.loss], feed_dict)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "class ValueEstimator():\n",
    "    \"\"\"\n",
    "    Value Function approximator. \n",
    "    \"\"\"\n",
    "    \n",
    "    def __init__(self, learning_rate=0.1, scope=\"value_estimator\"):\n",
    "        with tf.variable_scope(scope):\n",
    "            self.state = tf.placeholder(tf.float32, [400], \"state\")\n",
    "            self.target = tf.placeholder(dtype=tf.float32, name=\"target\")\n",
    "\n",
    "            # This is just linear classifier\n",
    "            self.output_layer = tf.contrib.layers.fully_connected(\n",
    "                inputs=tf.expand_dims(self.state, 0),\n",
    "                num_outputs=1,\n",
    "                activation_fn=None,\n",
    "                weights_initializer=tf.zeros_initializer)\n",
    "\n",
    "            self.value_estimate = tf.squeeze(self.output_layer)\n",
    "            self.loss = tf.squared_difference(self.value_estimate, self.target)\n",
    "\n",
    "            self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
    "            self.train_op = self.optimizer.minimize(\n",
    "                self.loss, global_step=tf.contrib.framework.get_global_step())        \n",
    "    \n",
    "    def predict(self, state, sess=None):\n",
    "        sess = sess or tf.get_default_session()\n",
    "        state = featurize_state(state)\n",
    "        return sess.run(self.value_estimate, { self.state: state })\n",
    "\n",
    "    def update(self, state, target, sess=None):\n",
    "        sess = sess or tf.get_default_session()\n",
    "        state = featurize_state(state)\n",
    "        feed_dict = { self.state: state, self.target: target }\n",
    "        _, loss = sess.run([self.train_op, self.loss], feed_dict)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def actor_critic(env, estimator_policy, estimator_value, num_episodes, discount_factor=1.0):\n",
    "    \"\"\"\n",
    "    Actor Critic Algorithm. Optimizes the policy \n",
    "    function approximator using policy gradient.\n",
    "    \n",
    "    Args:\n",
    "        env: OpenAI environment.\n",
    "        estimator_policy: Policy Function to be optimized \n",
    "        estimator_value: Value function approximator, used as a critic\n",
    "        num_episodes: Number of episodes to run for\n",
    "        discount_factor: Time-discount factor\n",
    "    \n",
    "    Returns:\n",
    "        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.\n",
    "    \"\"\"\n",
    "\n",
    "    # Keeps track of useful statistics\n",
    "    stats = plotting.EpisodeStats(\n",
    "        episode_lengths=np.zeros(num_episodes),\n",
    "        episode_rewards=np.zeros(num_episodes))    \n",
    "    \n",
    "    Transition = collections.namedtuple(\"Transition\", [\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n",
    "    \n",
    "    for i_episode in range(num_episodes):\n",
    "        # Reset the environment and pick the fisrst action\n",
    "        state = env.reset()\n",
    "        \n",
    "        episode = []\n",
    "        \n",
    "        # One step in the environment\n",
    "        for t in itertools.count():\n",
    "            \n",
    "            # env.render()\n",
    "            \n",
    "            # Take a step\n",
    "            action = estimator_policy.predict(state)\n",
    "            next_state, reward, done, _ = env.step(action)\n",
    "            \n",
    "            # Keep track of the transition\n",
    "            episode.append(Transition(\n",
    "              state=state, action=action, reward=reward, next_state=next_state, done=done))\n",
    "            \n",
    "            # Update statistics\n",
    "            stats.episode_rewards[i_episode] += reward\n",
    "            stats.episode_lengths[i_episode] = t\n",
    "            \n",
    "            # Calculate TD Target\n",
    "            value_next = estimator_value.predict(next_state)\n",
    "            td_target = reward + discount_factor * value_next\n",
    "            td_error = td_target - estimator_value.predict(state)\n",
    "            \n",
    "            # Update the value estimator\n",
    "            estimator_value.update(state, td_target)\n",
    "            \n",
    "            # Update the policy estimator\n",
    "            # using the td error as our advantage estimate\n",
    "            estimator_policy.update(state, td_error, action)\n",
    "            \n",
    "            # Print out which step we're on, useful for debugging.\n",
    "            print(\"\\rStep {} @ Episode {}/{} ({})\".format(\n",
    "                    t, i_episode + 1, num_episodes, stats.episode_rewards[i_episode - 1]), end=\"\")\n",
    "\n",
    "            if done:\n",
    "                break\n",
    "                \n",
    "            state = next_state\n",
    "    \n",
    "    return stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n",
      "Instructions for updating:\n",
      "Use `tf.global_variables_initializer` instead.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[2017-06-16 13:31:05,772] From /Users/dennybritz/venv/py3/lib/python3.6/site-packages/tensorflow/python/util/tf_should_use.py:170: initialize_all_variables (from tensorflow.python.ops.variables) is deprecated and will be removed after 2017-03-02.\n",
      "Instructions for updating:\n",
      "Use `tf.global_variables_initializer` instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Step 662 @ Episode 50/50 (65.13252566564918))"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "\n",
    "global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n",
    "policy_estimator = PolicyEstimator(learning_rate=0.001)\n",
    "value_estimator = ValueEstimator(learning_rate=0.1)\n",
    "\n",
    "with tf.Session() as sess:\n",
    "    sess.run(tf.initialize_all_variables())\n",
    "    # Note, due to randomness in the policy the number of episodes you need varies\n",
    "    # TODO: Sometimes the algorithm gets stuck, I'm not sure what exactly is happening there.\n",
    "    stats = actor_critic(env, policy_estimator, value_estimator, 50, discount_factor=0.95)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plotting.plot_episode_stats(stats, smoothing_window=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
